Loading and arranging the data

Setup:

knitr::opts_chunk$set(echo = TRUE, max.print = 100)

Load libraries:

# Attach all packages up front. NOTE(review): attach order is significant —
# each later package masks same-named functions from earlier ones (see the
# "masked" messages below). In particular, plyr is attached AFTER tidytable
# and plotly, so bare calls to mutate()/arrange()/summarise() later in this
# document resolve to plyr's versions; confirm this is intentional.
library(readxl)       # read_xlsx(): Excel input
library(data.table)   # fread(), melt(), dcast(), setnames(), setorder()
library(tidytable)    # tidyverse-style verbs backed by data.table
## As of tidytable v0.9.0 dotless versions of functions are exported.
## You can now use `arrange()`/`mutate()`/etc. directly.
## 
## Attaching package: 'tidytable'
## The following objects are masked from 'package:data.table':
## 
##     between, first, fread, last
## The following objects are masked from 'package:stats':
## 
##     dt, filter, lag
## The following object is masked from 'package:base':
## 
##     %in%
library(stringr)      # str_c(), str_count(), str_replace(), str_split_fixed()
library(stringi)      # stri_extract_all_regex()
library(Cairo)        # cairo_pdf() device for the figures
library(geosphere)    # distHaversine()
library(plotly)       # 3D scatter plot; also attaches ggplot2
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:tidytable':
## 
##     arrange, distinct, filter, group_by, mutate, rename, select, slice,
##     summarise, transmute, ungroup
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(geodata)      # world map data helpers (attaches terra)
## Loading required package: terra
## terra 1.5.21
## 
## Attaching package: 'terra'
## The following object is masked from 'package:ggplot2':
## 
##     arrow
## The following object is masked from 'package:tidytable':
## 
##     extract
## The following object is masked from 'package:data.table':
## 
##     shift
library(plyr)         # NOTE(review): masks mutate/arrange/summarise from above
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise
## The following objects are masked from 'package:tidytable':
## 
##     arrange, count, desc, mutate, rename, summarise, summarize
library(tibble)       # column_to_rownames(), rownames_to_column(), enframe()
## 
## Attaching package: 'tibble'
## The following object is masked from 'package:tidytable':
## 
##     enframe
library(forcats)      # as_factor()
library(purrr)        # pluck(); NOTE(review): `map` is later shadowed by a data object
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
## 
##     compact
## The following objects are masked from 'package:tidytable':
## 
##     map, map_chr, map_dbl, map_df, map_dfc, map_dfr, map_int, map_lgl,
##     map2, map2_chr, map2_dbl, map2_df, map2_dfc, map2_dfr, map2_int,
##     map2_lgl, pmap, pmap_chr, pmap_dbl, pmap_df, pmap_dfc, pmap_dfr,
##     pmap_int, pmap_lgl, walk
## The following object is masked from 'package:data.table':
## 
##     transpose
library(Rfast)        # Dist(): fast pairwise distance matrix
## Loading required package: Rcpp
## Loading required package: RcppZiggurat
## 
## Attaching package: 'Rfast'
## The following objects are masked from 'package:purrr':
## 
##     is_integer, transpose
## The following object is masked from 'package:tidytable':
## 
##     nth
## The following object is masked from 'package:data.table':
## 
##     transpose

Load Phonotacticon:

# Read the Phonotacticon spreadsheet and store it as a data.table.
Phonotacticon <- as.data.table(read_xlsx("Phonotacticon.xlsx"))

Phonotacticon

Load PhonoBib:

# Read the PhonoBib spreadsheet (bibliography + lect metadata) as a data.table.
PhonoBib <- as.data.table(read_xlsx("PhonoBib.xlsx"))

PhonoBib

Load PanPhon and sort out duplicate rows:

# Read the PanPhon segment/feature table, keeping one row per 'ipa' value.
# fread() already returns a data.table, so the original as.data.table()
# coercion was redundant and has been removed.
PanPhon <- fread("PanPhonPhonotacticon1_0.csv") %>% 
  unique(by = 'ipa')

PanPhon

Arrange PanPhon segments in alphabetical order:

# Order the PanPhon segments longest-first (ties broken alphabetically) so
# that regex alternation below matches the longest segment greedily.
PanPhonOrder <- with(PanPhon, ipa[order(-nchar(ipa), ipa)])

head(PanPhonOrder)
## [1] "h͡d̪͡ɮ̪ʲʷ" "h͡d̪͡ɮ̪ʷː" "h͡d̪͡ɮ̪ʷˠ" "h͡d̪͡ɮ̪ʷˤ" "h͡d̪͡z̪ʲʷ" "h͡d̪͡z̪ʷː"

Create PanPhon regex:

# Non-capturing alternation over every PanPhon segment (longest first) plus
# the capital-letter cover symbols used in the phonotactic formulas.
PanPhonRegex <- sprintf("(?:%s|%s)",
                        paste(PanPhonOrder, collapse = "|"),
                        'B|C|Č|F|G|Ł|L|N|O|P|R|S|T|V|W|X|Z')

str_trunc(PanPhonRegex, 100)
## [1] "(?:h͡d̪͡ɮ̪ʲʷ|h͡d̪͡ɮ̪ʷː|h͡d̪͡ɮ̪ʷˠ|h͡d̪͡ɮ̪ʷˤ|h͡d̪͡z̪ʲʷ|h͡d̪͡z̪ʷː|h͡d̪͡z̪ʷˠ|h͡d̪͡z̪ʷˤ|h͡t̪͡ɬ̪ʲʷ|h͡t̪..."

Create PanPhon regex including brackets:

# Same alternation as PanPhonRegex, but with a first alternative that
# matches the whole content between square brackets (via lookarounds), so
# bracketed spans are extracted as single tokens.
PanPhonRegexBrackets <- sprintf("(?:%s|%s|%s)",
                                '(?<=\\[).*?(?=\\])',
                                paste(PanPhonOrder, collapse = "|"),
                                'B|C|Č|F|G|Ł|L|N|O|P|R|S|T|V|W|X|Z')

str_trunc(PanPhonRegexBrackets, 100)
## [1] "(?:(?<=\\[).*?(?=\\])|h͡d̪͡ɮ̪ʲʷ|h͡d̪͡ɮ̪ʷː|h͡d̪͡ɮ̪ʷˠ|h͡d̪͡ɮ̪ʷˤ|h͡d̪͡z̪ʲʷ|h͡d̪͡z̪ʷː|h͡d̪͡z̪ʷˠ|h͡d̪͡z̪..."

Define classes:

# Map each capital-letter cover symbol used in the phonotactic formulas to
# the set of PanPhon segments it stands for, defined by PanPhon's feature
# values (1 = plus, -1 = minus). G is defined by literal glide symbols
# rather than features. F and T are backtick-quoted to avoid clashing with
# the FALSE/TRUE shorthands. Result: one (ipa, Class) row per membership.
Classes <- PanPhon %>% 
  mutate(B = cons == 1 & lab == 1,
        C = cons == 1,
        Č = cons == 1 & delrel == 1 & son == -1,
        `F` = cons == 1 & cont == 1 & son == -1,
        G = grepl('j|w|ɥ|ɰ', ipa),
        Ł = cons == 1 & cor == 1 & lat == 1,
        L = cons == 1 & cont == 1 & cor == 1 & son == 1,
        N = nas == 1 & syl == -1,
        O = cont == 1 & son == 1 & syl == -1 & !grepl('h|ɦ', ipa),
        P = cons == 1 & cont == -1 & delrel == -1 & son == -1,
        R = cons == 1 & cont == 1 & cor == 1 & lat == -1 & son == 1,
        S = cons == 1 & cont == 1 & cor == 1 & son == -1,
        `T` = cons == 1 & son != 1,
        V = cons == -1 & cont == 1 & son == 1 & syl == 1,
        W = syl == -1 & voi == 1,
        X = syl == -1 & voi == -1,
        Z = cont == 1 & syl == -1) %>% 
  select(ipa, B, C, Č, `F`, G, Ł, L, N, O, P, R, S, `T`, V, W, X, Z) %>%
  melt(id.vars = 'ipa',
     variable.name = 'Class',
     value.name = 'Value') %>% 
  filter(Value) %>%    # keep only TRUE memberships
  select(-Value)

Classes

Subset Eurasian lects that are complete in Phonotacticon:

# Keep only Eurasian lects whose description is complete and machine-usable:
# the P column present, O/N/C not '?', no runs of 2+ consecutive 'C' cover
# symbols, and no two long bracketed alternations in one value (both of
# which would explode the combinatorial expansion steps below).
Eurasia <- Phonotacticon %>% 
  filter(Macroarea == 'Eurasia',
  Complete, 
  complete.cases(P), 
         O != '?', 
         N != '?',
         C != '?',
         !grepl('C{2,}', O), 
         !grepl('C{2,}', C),
         !grepl('\\[.{10}.*?\\]\\[.{10}.*?\\]', O),
         !grepl('\\[.{10}.*?\\]\\[.{10}.*?\\]', C)) %>% 
  select(-Note, -Complete, -Macroarea)   # drop columns not used downstream

Eurasia[1,]

Extract phonemes:

# Split each lect's P string into individual PanPhon segments.
# simplify = TRUE yields a character matrix (one row per lect, padded);
# melt it to one (Lect, ipa) row per phoneme. The final filter keeps only
# rows where the condition is TRUE, so both '' and NA padding are dropped.
Phonemes <- stri_extract_all_regex(Eurasia$P, 
                         pattern = PanPhonRegex,
                         simplify = TRUE) %>% 
  as.data.table() %>% 
  mutate(Lect = Eurasia$Lect) %>%   # rows are aligned with Eurasia by position
  melt(id.vars = 'Lect',
       variable.name = 'Number',
       value.name = 'ipa') %>% 
  select(-Number) %>% 
  filter(ipa != '')

Phonemes

Subset lect, onsets, nuclei, and codas:

# Reshape the onset (O), nucleus (N), and coda (C) columns into long form:
# one row per (Lect, Category, Sequence-string).
LectONC <- Eurasia %>% 
  select(Lect, O, N, C) %>% 
  melt(id.vars = 'Lect',
       variable.name = 'Category',
       value.name = 'Sequence')

LectONC

Extract the sequences:

# Split each space-separated Sequence string into its individual sequences
# (tstrsplit gives one column per piece), then reshape to one row per
# (Lect, Category, Sequence) and drop padding NAs.
# NOTE(review): fixed = FALSE treats ' ' as a regex; fixed = TRUE would be
# equivalent for a literal space and faster.
Sequences <- LectONC[, tstrsplit(Sequence, ' ', fixed = FALSE)] %>% 
  mutate(Lect = LectONC$Lect,
         Category = LectONC$Category) %>%   # tstrsplit result is row-aligned with LectONC
  melt(id.vars = c('Lect', 'Category'),
       variable.name = 'Number',
       value.name = 'Sequence') %>% 
  select(-Number) %>% 
  filter(!is.na(Sequence)) %>% 
  distinct()

Sequences

Subset sequences including capital letters:

# Subset the sequences that still contain a capital-letter cover symbol;
# these need to be expanded into concrete phonemes before comparison.
Capitals <- 
  Sequences %>% 
  filter(grepl('B|C|Č|F|G|Ł|L|N|O|P|R|S|T|V|W|X|Z', Sequence)) %>% 
  select(-Category) %>% 
  distinct()

Capitals

Convert the capitals into the corresponding phonemes:

# Expand every cover-symbol-bearing sequence into all concrete phoneme
# sequences it licenses, restricted to each lect's own phoneme inventory,
# then substitute the expansions back into the full Sequences table.
Decapitalized <- 
  # 1. Split each sequence into single segments / cover symbols.
  stri_extract_all_regex(Capitals$Sequence,
                         pattern = PanPhonRegex,
                         simplify = TRUE) %>% 
  as.data.table() %>% 
  mutate(Lect = Capitals$Lect,
         Sequence = Capitals$Sequence) %>% 
  melt(id.vars = c('Lect', 'Sequence'),
        variable.name = 'Order',
       value.name = 'Segment') %>%
  mutate(Order = Order %>% 
           as.factor() %>% 
           as.integer()) %>%   # column labels V1, V2, ... -> slot index 1, 2, ...
  filter(Segment != '') %>% 
  # 2. Replace each cover symbol (a Class) by all of its member segments;
  #    plain segments (no Class match) pass through unchanged.
  left_join(Classes, by = c('Segment' = 'Class')) %>% 
  mutate(ipa = if_else(is.na(ipa), Segment, ipa)) %>% 
  select(-Segment) %>% 
  # 3. Keep only segments actually in that lect's inventory.
  inner_join(Phonemes) %>% 
  setorder(col = Order) %>%   # NOTE(review): setorder() has no `col` argument — verify this sorts by Order as intended
  # 4. Per (Lect, Sequence): collect the candidate segments for each slot,
  #    cross them with expand.grid, and paste each combination into one
  #    candidate sequence string.
  split(by = c('Lect', 'Sequence')) %>% 
  lapply(function(x)
           split(x, by = 'Order')) %>% 
  lapply(function(x)
    lapply(x, function(x)
      x <- x$ipa)) %>% 
  lapply(function(x)
    expand.grid(x) %>%
      do.call(what = paste0)) %>% 
  enframe() %>% 
  unnest() %>% 
  as.data.table() %>% 
  # 5. Recover Lect and original Sequence from the "Lect.Sequence" list names.
  separate(col = name,
           into = c('Lect', 'Sequence'),
           sep = '\\.') %>% 
  setnames('value', 'NewSequence') %>% 
  # 6. Merge expansions back; unexpanded sequences keep their original form.
  full_join(Sequences) %>% 
  mutate(Sequence =
           if_else(!is.na(NewSequence),
                   NewSequence,
                   Sequence)) %>% 
  select(-NewSequence)

Decapitalized

Split the sequences into segments:

# Split every (decapitalized) sequence into ordered segments. The
# bracket-aware regex extracts the whole content of any [...] span as a
# single token, to be expanded in the next step.
ToUnbracket <- stri_extract_all_regex(Decapitalized$Sequence, 
                         pattern = PanPhonRegexBrackets,
                         simplify = TRUE) %>% 
  as.data.table() %>% 
  mutate(Lect = Decapitalized$Lect,
         Category = Decapitalized$Category,
         Sequence = Decapitalized$Sequence) %>%   # matrix rows align with Decapitalized
  melt(id = c('Lect', 'Category', 'Sequence'),
       variable.name = 'Order',
       value.name = 'ipa') %>% 
  mutate(Order = Order %>% 
           as.factor() %>% 
           as.integer()) %>%   # V1, V2, ... -> segment position 1, 2, ...
  filter(ipa != "")   # non-TRUE conditions are dropped, so NA padding goes too

ToUnbracket

Subset bracketed sequences:

# Subset the segment rows whose source sequence contains a bracketed
# alternation; only these need further expansion.
Bracketed <- ToUnbracket %>%
  filter(grepl('\\[', Sequence))

Bracketed

Convert the bracketed sequences into all possible sequences:

# Expand each bracketed alternation into all concrete sequences: split the
# bracket content into its alternative segments, cross the alternatives of
# every slot, and paste each combination into one sequence string.
Unbracketed <- Bracketed$ipa %>% 
  stri_extract_all_regex(pattern = PanPhonRegex, simplify = TRUE) %>% 
  as.data.table() %>% 
  mutate(Sequence = Bracketed$Sequence,
         Order = Bracketed$Order) %>%   # matrix rows align with Bracketed
  melt(id.vars = c('Sequence', 'Order'),
       variable.name = 'Number',
       value.name = 'ipa') %>%
  filter(ipa != '') %>%   # drop matrix padding
  select(-Number) %>% 
  setorder(col = Order) %>%   # NOTE(review): setorder() has no `col` argument — verify this sorts by Order as intended
  split(by = 'Sequence') %>% 
  lapply(function(x)
    split(x, by = 'Order')) %>% 
  lapply(function(x)
    lapply(x, function(x)
      x <- x$ipa)) %>% 
  lapply(function(x)
    expand.grid(x) %>%        # cartesian product over slots
      do.call(what = paste0)) %>% 
  enframe() %>% 
  unnest() %>% 
  setnames(c('name', 'value'),
           c('Sequence', 'NewSequence'))
  
Unbracketed

Split the sequences into segments:

# Split the fully expanded sequences into ordered segments and merge them
# back into the main segment table: rows that came from bracketed
# sequences get their NewSequence/NewIPA values; everything else keeps the
# original Sequence/ipa.
Segments <- 
  stri_extract_all_regex(
  Unbracketed$NewSequence,
  pattern = PanPhonRegex,
  simplify = TRUE) %>% 
  as.data.table() %>% 
  mutate(Sequence = Unbracketed$Sequence,
         NewSequence = Unbracketed$NewSequence) %>%   # rows align with Unbracketed
    pivot_longer(cols = -c(Sequence, NewSequence),
               names_to = 'Order',
               values_to = 'NewIPA') %>% 
    mutate(Order = Order %>% 
           as.factor() %>% 
           as.integer()) %>%   # V1, V2, ... -> segment position 1, 2, ...
  filter(NewIPA != '') %>%     # drop matrix padding
  full_join(ToUnbracket) %>% 
  mutate(Sequence = 
           if_else(
             !is.na(NewSequence),
             NewSequence,
             Sequence),
         ipa = 
           if_else(
             !is.na(NewIPA),
             NewIPA,
             ipa)) %>% 
  select(-NewSequence, -NewIPA)

Segments

Measure the length of each sequence:

# A sequence's length is its highest segment index within each
# (Lect, Category, Sequence) group.
Sequences_length <- Segments[,
  list(Length = max(Order)),
  by = list(Lect, Category, Sequence)]

Sequences_length

Join the length of each sequence to segments:

# Attach each sequence's total length to its segment rows (joins on the
# shared Lect/Category/Sequence columns).
Segments <- left_join(Segments, Sequences_length)

Segments

Count the maximal length:

# Longest sequence observed; bounds the alignment positions below.
MaxLength <- max(Sequences_length$Length)

MaxLength
## [1] 6

Count the number of split segments:

# Number of segment rows; used to label replicated copies below.
Segments_number <- nrow(Segments)

Segments_number
## [1] 44593

Assign different positions to each sequence:

# Create one copy of every sequence at each possible starting position
# 0..MaxLength-1, shifting the segment indices accordingly, and drop
# copies that would run past MaxLength. This lets shorter sequences be
# compared against longer ones at every alignment offset.
Sequences_rep <- bind_rows(rep(list(Segments), MaxLength))

Sequences_rep <- Sequences_rep %>% 
  mutate(Position = rep(0:(MaxLength - 1),
                        each = Segments_number)) %>%   # one Position value per copy of Segments
  mutate(Order = Order + Position) %>% 
  filter(Length + Position <= MaxLength) %>% 
  select(-Length)

Sequences_rep

Calculating the phonological distance

Join segments with their phonological features:

# Build a fixed-length numeric feature vector per aligned sequence: join
# each segment's PanPhon features, tag every feature name with its segment
# slot (e.g. "syl1", "cons2"), and spread to wide format. Empty slots are
# filled with 0, so all rows are directly comparable vectors.
Sequences_features <- Sequences_rep %>% 
  left_join(PanPhon, by = 'ipa') %>% 
               melt(id = c('Lect', 
                         'Category', 
                         'Sequence', 
                         'Order', 
                         'ipa', 
                         'Position'),
               variable.name = 'Feature',
               value.name = 'Value') %>% 
  mutate(Feature = paste0(Feature, Order)) %>%   # feature name + slot index
  dcast(Lect + Category + Sequence + Position ~ Feature,
        value.var = 'Value',
        fun.aggregate = sum,   # collapses duplicate cells by summing
        fill = 0) %>%
  mutate(SequencePosition = paste0(Sequence, Position)) %>%   # row label used later
  select(-Lect, -Category, -Position, -Sequence) %>% 
  distinct()

Sequences_features

Calculate the distance between sequences:

# Pairwise distances between all aligned feature vectors (Rfast::Dist;
# default metric — euclidean — is used since no method is given).
Sequences_distance <- Sequences_features %>% 
  select(-SequencePosition) %>%   # keep only the numeric feature columns
  Dist() %>%
  as.data.table()

Sequences_distance[1:5, 1:5]

Define a vector of all sequences:

# Strip the trailing Position digit appended by paste0(Sequence, Position)
# above, recovering the bare sequence label for each matrix row.
# Fix: anchor the pattern to the string end ('[0-9]+$'). The original
# '[0-9]' removed only the FIRST digit anywhere in the string, which would
# corrupt any sequence that itself contained a digit; for the actual data
# (a single trailing digit 0-5) the result is identical.
SequenceVectors <- 
  str_replace(Sequences_features$SequencePosition, '[0-9]+$', '')

head(SequenceVectors)
## [1] "bl" "bl" "bl" "bl" "bl" "d"

Name the rows and the columns of the distance matrix:

# Label the distance matrix: columns get the sequence labels, and a
# Sequence.x column carries the same labels for the rows (duplicate labels
# are fine — they are collapsed by the min-aggregation below).
setnames(Sequences_distance, SequenceVectors)

Sequences_distance[, Sequence.x := SequenceVectors]

Sequences_distance[1:5, 1:5]

Calculate the minimal distance for each sequence pair:

# The matrix has one row/column per (sequence, alignment-position) copy;
# collapse to one value per sequence pair by taking the minimum distance
# over all alignments.
Sequences_MinDistance <- 
  Sequences_distance %>% 
  melt(id.vars = 'Sequence.x',
     variable.name = 'Sequence.y',
     value.name = 'Distance') %>% 
  as.data.table() %>% 
  .[, .(Distance = min(Distance)),
     by = .(Sequence.x, Sequence.y)]

Sequences_MinDistance

Subset the distance of /pl/ and other segments:

# Sanity check: the sequences closest to /pl/, nearest first, excluding
# the empty marker '∅'.
pl <- Sequences_MinDistance %>% 
  filter(Sequence.x == 'pl',
         Sequence.y != '∅') %>% 
  arrange(Distance)   # NOTE(review): with plyr attached last, this resolves to plyr::arrange

pl

Subset the distance of /ia/ and other segments:

# Sanity check: the sequences closest to /ia/, nearest first, excluding
# the empty marker '∅'.
ia <- Sequences_MinDistance %>% 
  filter(Sequence.x == 'ia',
         Sequence.y != '∅') %>% 
  arrange(Distance)   # NOTE(review): with plyr attached last, this resolves to plyr::arrange

ia

Rebuild the lect–category–sequence table using the new (decapitalized and unbracketed) sequences:

# One row per (Lect, Category, Sequence) using the fully expanded
# (decapitalized and unbracketed) sequences.
NewSequences <- Segments %>% 
  select(Lect, Category, Sequence) %>% 
  as.data.table()

NewSequences

Compute the distance between each pair of lects within the same category:

# Lect-to-lect distance per category (O/N/C):
# 1. pair every sequence of one lect with every sequence of every lect in
#    the same category (join on Category);
# 2. for each sequence of lect x, take the MINIMUM distance to any
#    sequence of lect y;
# 3. average those minima over x's sequences (directed distance x -> y);
# 4. symmetrize each unordered pair by taking the MAX of both directions;
# 5. spread the three categories into columns.
ONC_distance <- NewSequences %>% 
  left_join(NewSequences, by = 'Category') %>% 
  left_join(Sequences_MinDistance, 
            by = c('Sequence.x', 'Sequence.y')) %>% 
  .[, .(Distance = min(Distance)),
              by = .(Lect.x, Lect.y, Sequence.x, Category)] %>% 
  .[, .(Distance = mean(Distance)), 
              by = .(Lect.x, Lect.y, Category)] %>%
  mutate(Lect_vs_Lect = str_c(pmin(Lect.x, Lect.y), 
                      'vs.', 
                      pmax(Lect.x, Lect.y),
                      sep = ' ')) %>%   # canonical unordered pair label
  .[, .(Distance = max(Distance)), 
               by = .(Lect_vs_Lect, Category)] %>% 
   dcast(., Lect_vs_Lect ~ Category, value.var = 'Distance')
  
ONC_distance

Count the number of tones in each lect:

# Count tones per lect: the T column holds space-separated tone entries,
# so the count is (number of spaces) + 1. A value of '-' marks "no tone
# information". NOTE(review): gsub() with an NA replacement is unusual —
# confirm that '-' values really end up as NA here rather than the string
# "NA".
Tones <- Eurasia %>%
  select(Lect, `T`) %>%
  mutate(`T` = gsub("\\-", NA, `T`)) %>%
  mutate(`T` = str_count(`T`, " ") + 1)

# Lects with no tone entry get a tone count of 0.
Tones[is.na(Tones$`T`),]$`T` <- 0

Tones

Calculate the Canberra distance between the numbers of tones of each pair of lects:

# Canberra distance between the tone counts of each pair of lects.
# Fix: compute dist() on the numeric tone column only. The original passed
# the whole Tones table, so the character Lect column was coerced to
# numeric NA ("NAs introduced by coercion" warning) and dist() rescaled
# every distance by 2 to compensate for the dropped column. The constant
# factor is removed later by scale() in PhonoDist, but the values here are
# now the true Canberra distances and the warning is gone.
Tones_distance <- dist(Tones$`T`, method = 'canberra') %>% 
  as.matrix() %>%
  as.data.table() %>% 
  setnames(Tones$Lect) %>%        # rows/columns are in Tones' lect order
  mutate(Lect = Tones$Lect) %>% 
  melt(id = 'Lect', 
       variable.name = 'Lect2', 
       value.name = 'T') %>%
  # canberra yields 0/0 = NA when both lects have zero tones; treat as 0.
  mutate(`T` = replace_na(`T`, 0)) %>% 
  mutate(Lect_vs_Lect = str_c(pmin(as.character(Lect), as.character(Lect2)),
                                'vs.',
                                pmax(as.character(Lect), as.character(Lect2)),
                               sep = ' ')) %>%   # canonical unordered pair label
  select(Lect_vs_Lect, `T`) %>%
  distinct()

Tones_distance

Join segmental distance with tonal distance:

# Combine segmental (O/N/C) and tonal (T) distances per lect pair (joins
# on the shared Lect_vs_Lect column).
ONCT_distance <- ONC_distance %>% 
  full_join(Tones_distance)

ONCT_distance

Calculate the overall distance (PhonoDist):

# Overall phonological distance: z-score each of the four components,
# shift each to a zero minimum, and take the Euclidean norm of the four
# shifted components.
PhonoDist <- ONCT_distance %>% 
  select(-Lect_vs_Lect) %>% 
  scale() %>%                     # column-wise standardization
  as.data.table() %>% 
  mutate(Lect_vs_Lect = ONCT_distance$Lect_vs_Lect) %>%   # row order preserved by scale()
  mutate(Distance = sqrt((O - min(O)) ^ 2 +
                         (N - min(N)) ^ 2 +
                         (C - min(C)) ^ 2 +
                         (`T` - min(`T`)) ^ 2)) %>% 
  select(Lect_vs_Lect, Distance)

PhonoDist

Split the Lect_vs_Lect column of PhonoDist into Lect.x and Lect.y:

# Split the canonical "A vs. B" pair labels back into two lect columns.
Lect_vs_Lect <- PhonoDist$Lect_vs_Lect %>% 
  str_split_fixed(' vs. ', n = 2) %>% 
  as.data.table() %>% 
  setnames(c('Lect.x', 'Lect.y'))

Lect_vs_Lect

Visualize the phonological distances via multidimensional scaling:

# Classical multidimensional scaling of the phonological distances.
# Rebuild a square distance structure: attach Lect.x/Lect.y, cast to a
# wide matrix, transpose so the filled triangle is the one as.dist()
# reads (dcast fills only one triangle — NOTE(review): verify), then run
# cmdscale with the maximum number of dimensions (n - 1).
PhonoScale <- PhonoDist %>% 
  bind_cols(Lect_vs_Lect) %>% 
  select(-Lect_vs_Lect) %>% 
  dcast(Lect.x ~ Lect.y, value.var = 'Distance') %>% 
  column_to_rownames('Lect.x') %>% 
  t() %>% 
  as.dist() %>% 
  cmdscale(k = nrow(Eurasia) - 1) %>%
  as.data.frame() %>% 
  rownames_to_column('Lect') %>% 
  as.data.table()
## Warning in cmdscale(., k = nrow(Eurasia) - 1): only 123 of the first 208
## eigenvalues are > 0
PhonoScale[1:10, 1:10]

Cluster PhonoScale into two groups:

# Partition the MDS coordinates into 2 clusters with k-means.
# NOTE(review): no set.seed() precedes kmeans(), so the clustering (and
# its labelling) is not reproducible across runs.
K2 <- PhonoScale %>% 
  select(-Lect) %>% 
  kmeans(2) %>% 
  pluck(1) %>%       # first element of the kmeans object: per-row cluster ids
  as_factor()

K2
##   [1] 2 2 1 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 2 2 2 1 2 1 2 1 1 2 1 2 2 2 1 1 1 1 2
##  [38] 1 2 1 2 1 1 1 1 2 1 1 1 1 2 1 2 1 1 2 2 1 1 2 1 2 2 1 1 1 2 1 1 2 1 2 1 1
##  [75] 1 2 1 1 1 2 2 1 1 2 1 2 2 1 2 1 1 2 1 1 2 1 1 1 1 2 2 1 2 2 1 1 1 1 1 1 1
## [112] 1 2 1 2 1 2 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 2 2 1 2 1 2 2 1 1 1 1 2 1 2
## [149] 1 1 1 1 2 1 2 2 1 1 1 2 1 2 1 2 2 1 1 2 2 1 2 2 1 1 1 1 1 1 2 2 2 1 2 1 1
## [186] 1 1 1 1 1 2 1 1 1 1 2 2 2 2 2 1 1 2 2 2 2 2 1 2
## Levels: 1 2

Cluster PhonoScale into five groups:

# Partition the MDS coordinates into 5 clusters with k-means.
# NOTE(review): no set.seed(), so results are not reproducible.
K5 <- PhonoScale %>% 
  select(-Lect) %>% 
  kmeans(5) %>% 
  pluck(1) %>%       # per-row cluster ids
  as_factor()

K5
##   [1] 4 1 5 5 4 2 3 5 3 1 2 2 3 2 2 5 1 1 1 5 4 5 4 2 4 2 3 4 2 4 5 4 1 5 2 3 5
##  [38] 2 4 3 4 2 1 2 3 4 5 2 5 3 4 1 4 3 1 4 4 1 2 1 3 5 4 1 3 2 1 1 3 4 2 5 2 2
##  [75] 2 4 3 1 1 4 5 3 2 1 2 4 5 1 5 2 3 4 2 2 4 1 2 2 1 5 1 3 5 5 3 3 3 3 1 1 2
## [112] 3 4 5 5 2 4 4 1 1 1 2 1 1 5 2 4 4 2 3 3 2 3 5 5 4 2 5 3 5 4 2 3 3 1 4 3 1
## [149] 2 2 3 2 4 2 5 4 1 1 2 4 2 5 3 1 5 2 2 1 2 1 5 4 2 3 2 2 5 2 4 5 5 2 4 1 2
## [186] 2 2 2 2 2 5 3 3 2 3 5 5 5 4 4 5 3 4 4 5 5 4 5 4
## Levels: 1 2 3 4 5

Cluster PhonoScale into ten groups:

# Partition the MDS coordinates into 10 clusters with k-means; cluster ids
# are shifted down by 1 so the labels run 0-9 (single characters for the
# map plot). NOTE(review): no set.seed(), so results are not reproducible.
K10 <- PhonoScale %>% 
  select(-Lect) %>% 
  kmeans(10) %>% 
  pluck(1) %>% 
  `-`(1) %>%         # 1..10 -> 0..9
  as_factor()

K10
##   [1] 4 9 3 6 7 1 0 3 5 9 2 1 3 2 1 3 5 5 5 8 4 8 7 2 4 2 5 6 1 7 8 7 9 3 2 5 8
##  [38] 1 7 0 7 2 5 1 0 4 8 2 3 0 7 5 6 3 5 4 4 5 2 9 0 8 4 5 0 1 9 5 0 7 1 8 2 2
##  [75] 2 4 0 9 5 7 6 3 2 2 2 4 6 5 8 5 5 4 2 2 6 5 2 5 5 8 2 3 8 8 5 3 3 0 5 2 2
## [112] 5 7 3 8 2 4 7 9 9 5 2 5 5 8 2 7 6 2 0 3 2 3 8 8 7 2 8 0 6 7 2 1 3 5 4 3 5
## [149] 2 2 0 2 7 2 7 6 5 5 2 4 2 6 3 9 8 2 2 5 8 5 8 4 2 3 2 2 2 2 7 8 8 2 7 5 2
## [186] 2 2 1 2 2 8 3 0 2 3 6 8 6 4 4 6 3 7 7 8 8 4 3 7
## Levels: 0 1 2 3 4 5 6 7 8 9

Subset lects and their coordinates:

# Longitude/latitude for each lect kept in the Eurasia subset.
Lect_LonLat <- PhonoBib %>% 
  filter(Lect %in% Eurasia$Lect) %>% 
  select(Lect, lon, lat)

Lect_LonLat

Add clusters to PhonoScale:

# Attach the three k-means partitions and the coordinates to each lect
# (K2/K5/K10 are in PhonoScale's row order).
PhonoClusters <- PhonoScale %>% 
  mutate(K2 = K2,
         K5 = K5,
         K10 = K10) %>% 
  select(Lect, K2, K5, K10) %>% 
  left_join(Lect_LonLat)

PhonoClusters

Load map data:

# World polygon outlines in ggplot2's long/lat/group data-frame format.
# NOTE(review): the name `map` shadows purrr::map() in the global
# environment; consider renaming (e.g. world_map) in a later pass.
map <- map_data("world")

# Fix: the original called head(world), but no `world` object was ever
# created here — it printed the body of the geodata::world *function*
# instead of the map data just loaded.
head(map)
## (Reviewer note: the output below is what the original `head(world)` call
## printed — the first lines of the geodata::world *function* definition,
## not map data, because no `world` data object exists at this point. This
## indicates `head(map)` was intended.)
##                                                                  
## 1 function (resolution = 5, level = 0, path, version = "latest", 
## 2     ...)                                                       
## 3 {                                                              
## 4     stopifnot(level[1] == 0)                                   
## 5     resolution = round(resolution[1])                          
## 6     stopifnot(resolution %in% 1:5)

Create a map of Eurasia:

# Orthographic-projection base map of Eurasia: white landmasses with grey
# borders, no axes or background.
EurasiaMap <- ggplot(map, aes(x = long, y = lat)) + 
  geom_polygon(aes(group = group),
               fill = "white", 
               color = "darkgrey", 
               size = 0.2) +   # NOTE(review): ggplot2 >= 3.4 prefers `linewidth` over `size` for line widths
  coord_map("ortho",
            orientation = c(20, 70, 0),   # projection centre / rotation
            xlim = c(10, 130),
            ylim = c(0, 90)) +
  theme_void()

EurasiaMap

Plot the two clusters on the map:

# Plot each lect's 2-cluster label at its coordinates, coloured by cluster.
PhonoK2 <- EurasiaMap +
  geom_text(aes(x = lon,
                 y = lat,
                 label = K2,
                 color = K2),
             data = PhonoClusters,
            show.legend = FALSE) +
  theme(legend.position = 'bottom')

# Write the figure to PDF via the Cairo device; dev.off() closes it.
# NOTE(review): if printing PhonoK2 errors, the device is left open —
# consider tryCatch()/on.exit() in interactive use.
cairo_pdf(file = 'PhonoK2.pdf', 
            width = 6, 
            height = 4.5, 
            family = 'Times New Roman')
PhonoK2
dev.off()
## quartz_off_screen 
##                 2
PhonoK2

Plot the five clusters on the map:

# Plot each lect's 5-cluster label at its coordinates, coloured by cluster.
PhonoK5 <- EurasiaMap +
  geom_text(aes(x = lon,
                 y = lat,
                 label = K5,
                 color = K5),
             data = PhonoClusters,
            show.legend = FALSE) +
  theme(legend.position = 'bottom')

# Save to PDF via Cairo, then re-display the plot inline.
cairo_pdf(file = 'PhonoK5.pdf', 
            width = 6, 
            height = 4.5, 
            family = 'Times New Roman')
PhonoK5
dev.off()
## quartz_off_screen 
##                 2
PhonoK5

Plot the ten clusters on the map:

# Plot each lect's 10-cluster label (0-9) at its coordinates.
PhonoK10 <- EurasiaMap +
  geom_text(aes(x = lon,
                 y = lat,
                 label = K10,
                 color = K10),
             data = PhonoClusters,
            show.legend = FALSE) +
  theme(legend.position = 'bottom')

# Save to PDF via Cairo, then re-display the plot inline.
cairo_pdf(file = 'PhonoK10.pdf', 
            width = 6, 
            height = 4.5, 
            family = 'Times New Roman')
PhonoK10
dev.off()
## quartz_off_screen 
##                 2
PhonoK10

Subset the first three dimensions of the multidimensional scaling and join the K-means clustering:

# First three MDS dimensions per lect, joined with the cluster labels and
# coordinates for plotting.
ThreeD <- PhonoScale %>% 
  select(Lect, V1, V2, V3) %>% 
  left_join(PhonoClusters)

head(ThreeD)

Draw a 3D plot of multidimensional scaling:

# Interactive 3D scatter of the first three MDS dimensions: each lect is
# drawn as its name, coloured by its 5-cluster assignment.
ThreeDPlot <- plot_ly(
            data = ThreeD,
            x = ~V1,
            y = ~V2,
            z = ~V3,
            color = ~K5,
            text = ~Lect,
            textfont = list(size = 15),
            type = 'scatter3d',
            mode = 'text') %>%    # render the text labels, not markers
  layout(showlegend = FALSE)

ThreeDPlot

Testing the correlation between phonological and geographical distances

Create a table of lects, coordinates, and families, plus a constant dummy column (Kilometers) used below as a cross-join key:

# Lects with coordinates and family, plus a constant 'Kilometers' column
# that serves as the join key for the all-pairs cross join below.
Lect_Kilometers <- Lect_LonLat %>% 
  left_join(PhonoBib) %>% 
  select(Lect, lon, lat, Family) %>% 
  mutate(Kilometers = 'Kilometers')
  
Lect_Kilometers

Create Lect vs. Lect pairs:

# Cartesian product of all lects (full join on the constant Kilometers
# column), with a canonical unordered pair label and a flag for pairs that
# cross family boundaries.
Coordinates <- Lect_Kilometers %>% 
  full_join(Lect_Kilometers, by = 'Kilometers') %>% 
  mutate(Lect_vs_Lect = str_c(pmin(Lect.x, Lect.y),
                             "vs.",
                              pmax(Lect.x, Lect.y),
                              sep = " "),
  Crossfamilial = Family.x != Family.y)   # NOTE(review): NA if either Family is NA
         
Coordinates

Subset Coordinates x:

# Coordinate pair of the first lect in each row (aligned with Coordinates).
Coordinates.x <- Coordinates %>% 
  select(lon.x, lat.x)

Coordinates.x

Subset Coordinates y:

# Coordinate pair of the second lect in each row (aligned with Coordinates).
Coordinates.y <- Coordinates %>% 
  select(lon.y, lat.y)

Coordinates.y

Calculate the geographical distances between two columns of coordinates:

# Haversine distance in kilometres between each pair of lects
# (Coordinates.x/.y are row-aligned with Coordinates), restricted to
# cross-family pairs to limit inherited similarity.
GeoDist <- Coordinates %>% 
  mutate(Kilometers = 
           distHaversine(Coordinates.x, Coordinates.y) / 1000) %>%   # metres -> km
  filter(Crossfamilial) %>% 
  select(Lect_vs_Lect, Kilometers) %>% 
  distinct()   # each unordered pair appears twice in the cross join

GeoDist

Join the phonological distances and geographical distances:

# Attach the phonological distance to each cross-family geographic pair.
PhonoGeoDist <- GeoDist %>% 
  left_join(PhonoDist, by = 'Lect_vs_Lect')

PhonoGeoDist

Linear regression between geographical distance and phonological distances:

# OLS regression of phonological distance on geographic distance over
# cross-family pairs. The piped data lands in lm()'s `data` argument
# (formula is given by name). The fit shows a positive, highly significant
# slope but a small R^2 (~0.084): geography explains little variance.
PhonoGeoDist %>% 
  lm(formula = Distance ~ Kilometers) %>% 
  summary()
## 
## Call:
## lm(formula = Distance ~ Kilometers, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9728 -0.6582 -0.0037  0.6696  4.0808 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.395e+00  1.433e-02  306.63   <2e-16 ***
## Kilometers  1.281e-04  3.128e-06   40.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.01 on 18174 degrees of freedom
## Multiple R-squared:  0.08448,    Adjusted R-squared:  0.08443 
## F-statistic:  1677 on 1 and 18174 DF,  p-value: < 2.2e-16
PhonoGeoDist

Visualize the linear regression:

# Scatter of phonological vs geographic distance with the fitted
# regression line overlaid; saved to PDF via Cairo and re-displayed.
PhonoLM <- 
  ggplot(aes(x = Kilometers, y = Distance), data = PhonoGeoDist) +
  geom_point(alpha = 0.1) +   # heavy overplotting; low alpha shows density
  geom_smooth(formula = y ~ x, method = 'lm', color = 'red') +
  theme_classic() +
  scale_x_continuous(name = 'Geographical distance (km)') +
  scale_y_continuous(name = 'Phonological distance (z)')

cairo_pdf(file = 'PhonoLM.pdf', 
          width = 4, 
          height = 3, 
          family = 'Times New Roman')
PhonoLM
dev.off()
## quartz_off_screen 
##                 2
PhonoLM